#### Bootstrap boosted linear model ####

source("./code/loadPackages.R") # Install and load all necessary packages
source("./code/getAdmin.R")     # Function to identify administration by date

## Import the FPLP survey data
fpls = fread("./data/fpls1976.csv")
fpls$birthDecade = as.factor(fpls$birthDecade)

## Import the core data regarding decision-makers
act = fread("./data/actorData.csv")
act$birthDecade = factor(act$birthDecade)

#### Set up predictive models ####
niter = 1000
glmBoostGridF = expand.grid(mstop = c(150, 250, 350, 450),
                            prune = c('no'))
lmBoostGrid = expand.grid(intercept = TRUE)

fitcontrolF = trainControl(method="boot", number=5, verboseIter = F, 
                           savePredictions = T) 

#### Run the models, 1000 iterations ####
set.seed(091620)
outPred = outResampTrain = outResampTest = outCoef = outPredTest = outTestError = outTestActual = list()
outPred2 = outResampTrain2 = outResampTest2 = outCoef2 = outPredTest2 = outTestError2 = list()
outPred3 = outResampTrain3 = outResampTest3 = outCoef3 = outPredTest3 = outTestError3 = list()

for (i in 1:niter) {
  # Get training and test data
  train = sample(1:nrow(fpls), 1*nrow(fpls), replace=T)
  trainSet = fpls[train,]
  testSet = fpls[-unique(train),]
  
  # Do predictions with all covariates
  fitOne = train(MI1 ~ male + birthDecade + 
                   CollegeGrad + MA + MBA + LLBJD + MD + PhD + 
                   MilService + FSO + wwii + korea + vietnam + MilOfficer + Rep + Dem, 
                 data=trainSet, method="glmboost", tuneGrid=glmBoostGridF,
                 trControl=fitcontrolF, na.action=na.omit,
                 metric="Rsquared")
  
  prs = predict(fitOne, newdata = testSet)
  outPredTest[[i]] = data.frame(id=testSet$id, pred=prs)
  outResampTrain[[i]] = fitOne$resample
  outResampTest[[i]] = postResample(prs, testSet$MI1)
  outPred[[i]] = predict(fitOne, act)
  outCoef[[i]] = data.frame(t(coef(fitOne$finalModel)))
  outTestError[[i]] = predict(fitOne, testSet) - testSet$MI1
  outTestActual[[i]] = data.frame(id=testSet$id, actual=testSet$MI1)
  
  # Remove bureaucratic affiliations
  fitTwo = train(MI1 ~ male + birthDecade + 
                   CollegeGrad + MA + MBA + LLBJD + MD + PhD + 
                   MilService + wwii + korea + vietnam + Rep + Dem, 
                 data=trainSet, method="glmboost", na.action=na.omit, tuneGrid=glmBoostGridF,
                 trControl=fitcontrolF, metric="Rsquared")
  
  prs2 = predict(fitTwo, newdata = testSet)
  outPredTest2[[i]] = data.frame(id=testSet$id, pred=prs2)
  outResampTrain2[[i]] = fitTwo$resample
  outResampTest2[[i]] = postResample(prs2, testSet$MI1)
  outPred2[[i]] = predict(fitTwo, act)
  outCoef2[[i]] = data.frame(t(coef(fitTwo$finalModel)))
  outTestError2[[i]] = predict(fitTwo, testSet) - testSet$MI1
  
  # Linear model
  fitThree = train(MI1 ~ male + birthDecade + 
                     CollegeGrad + MA + MBA + LLBJD + MD + PhD + 
                     MilService + FSO + wwii + korea + vietnam + MilOfficer + Rep + Dem, 
                   data=trainSet, method="lm", na.action=na.omit, 
                   trControl=fitcontrolF, metric="Rsquared")
  
  prs3 = predict(fitThree, newdata = testSet)
  outPredTest3[[i]] = data.frame(id=testSet$id, pred=prs3)
  outResampTrain3[[i]] = fitThree$resample
  outResampTest3[[i]] = postResample(prs3, testSet$MI1)
  outPred3[[i]] = predict(fitThree, act)
  outCoef3[[i]] = data.frame(t(coef(fitThree$finalModel)))
  outTestError3[[i]] = predict(fitThree, testSet) - testSet$MI1
  
  if (i %% 25 == 0) {message(paste("Iteration", i, "of", niter, "complete.", sep=" "))}
}

## Compile other bootstrap results
resampTrain = rbindlist(outResampTrain)
resampTest = do.call("rbind", outResampTest)
coefs = rbindlist(outCoef, fill=TRUE)
testPreds = rbindlist(outPredTest)
errors = unlist(outTestError)

resampTrain2 = rbindlist(outResampTrain2)
resampTest2 = do.call("rbind", outResampTest2)
coefs2 = rbindlist(outCoef2, fill=TRUE)
testPreds2 = rbindlist(outPredTest2)
errors2 = unlist(outTestError2)

resampTrain3 = rbindlist(outResampTrain3)
resampTest3 = do.call("rbind", outResampTest3)
coefs3 = rbindlist(outCoef3, fill=TRUE)
testPreds3 = rbindlist(outPredTest3)
errors3 = unlist(outTestError3)

#### Save all the metrics for subsequent analysis ####
save.image("./data/predictHawkMetrics.RData")

# Clear the environment
rm(list = ls()) 